#include <reduce_ops.h>
/**
 * @brief Sequentially sums the first N elements of an array.
 *
 * Accumulation starts from a value-initialized T and applies operator+=
 * in ascending index order, so N == 0 returns T() without reading h_in.
 *
 * @tparam T    Element type; must be value-initializable and support +=.
 * @param h_in  Pointer to at least N readable elements.
 * @param N     Number of elements to sum.
 * @return      The accumulated sum of h_in[0] .. h_in[N-1].
 */
template <typename T> T reduce_sum_cpu(T *h_in, const unsigned int N) {
    T sum = T();
    // The index uses the same unsigned type as N: a signed int counter would
    // be compared after implicit conversion and would overflow (undefined
    // behavior) before reaching any N larger than INT_MAX.
    for (unsigned int i = 0; i < N; ++i) {
        sum += h_in[i];
    }
    return sum;
}
// Explicit instantiations: emit reduce_sum_cpu for float and int in this
// translation unit.
// NOTE(review): presumably needed because other translation units only see a
// declaration of the template (e.g. via reduce_ops.h) — confirm against the header.
template float reduce_sum_cpu<float>(float *, const unsigned int);
template int reduce_sum_cpu<int>(int *, const unsigned int);
